#!/usr/bin/env python3
#
# Bug reports and comments to Igor.Ermolaev@intel.com
#
from collections import defaultdict, OrderedDict
from copy import copy
from bisect import bisect, insort
from math import floor, ceil, log
from time import clock
from os.path import split, splitext
import fileinput
import re
import sys
import gzip
import xml.parsers.expat
import hashlib


class extract:
   def __init__( self, *args ): self.rx = re.compile( *args )
   def extr_str( self, line ):
      s = self.rx.match( line )
      return s.group(1) if s is not None else None
   def find_str( self, line ):
      s = self.rx.search( line )
      return s.group(1) if s is not None else None
   def find_int( self, line ):
      s = self.rx.search( line )
      return int( s.group(1) ) if s is not None else None
   def find_float( self, line ):
      s = self.rx.search( line )
      return float( s.group(1) ) if s is not None else None
   def repl_str( self, line, repl ): return self.rx.sub( repl, line )
   def chng_str( self, line, repl ):
      s = self.rx.subn( repl, line )
      return s[0] if s[1] > 0 else None


re_excs = extract( r"\s+n\s*=\s*(\d+)", re.I )
re_uops = extract( r"\s+u\s*=\s*(\d+)", re.I )
re_clks = extract( r"\s+c\s*=\s*(\d+)", re.I )
re_clkp = extract( r"\s+c\s*=\s*(\d+)\(.+\)", re.I )
re_cpi  = extract( r"\s+cpi\s*=\s*(\d+\.\d)", re.I )
re_ipc  = extract( r"\s+ipc\s*=\s*(\d+\.\d)", re.I )
re_frst = extract( r"\s+first\s*=\s*(\d+)", re.I )
re_last = extract( r"\s+last\s*=\s*(\d+)", re.I )


class perf_info:
   phases_prvd = True

   def __init__( self, line = None ):
      if line is not None:
         self.execs = re_excs.find_int( line )
         self.uops  = re_uops.find_int( line )
         self.clks  = re_clks.find_int( line )
         cpi        = re_cpi.find_float( line )
         ipc        = re_ipc.find_float( line )
         self.first = re_frst.find_int( line )
         self.last  = re_last.find_int( line )
         if ipc is not None: assert abs( ipc - self.execs / self.clks  ) < 0.05078125 # representable in binary system
         if cpi is not None: assert abs( cpi - self.clks  / self.execs ) < 0.05078125 # representable in binary system
         if self.clks is not None:
            if perf_info.phases_prvd: self.clks //= 2 # printed as phases ( 2 phases per cycle )
            self.weight = self.clks
         else:
            self.weight = self.execs
      else: self.first, self.last = None, None
   def revise( self, new ):
      self.first = min_or_none( min_or_none( self.first, new.first ), new.last  )
      self.last  = max_or_none( max_or_none( self.last , new.last  ), new.first )
   def print( self ): print( "first={}, last={}".format( self.first, self.last ) )


class edge( int ):
   def __new__( cls, val, target=None, first=None, last=None ): return int.__new__( cls, val )
   def __init__( self, val, target=None, first=None, last=None ): self.target,self.first,self.last = target,first,last
   def __add__( self, other ):
      return edge( int.__add__( self, other.execs ), None, min_or_none( self.first, other.first ), max_or_none( self.last , other.last ) )


class edges( dict ):
   def __missing__( self, key ): return edge(0)


class block( list, perf_info ):
   def __init__( self, instr ):
      perf_info.__init__( self )
      self.prevs = edges()
      self.nexts = edges()
      self.ret   = None
      self.call  = None
      self.start = instr.start
      self.append( instr )
      self.dgst  = int( hashlib.sha256(bytes.fromhex('0'*(len(self.start)&1)+self.start)).hexdigest()[0:5], 16 )

   def revise( self, blocks, rngs=None ):
      if rngs is None:
         self.clks   = None
         self.weight = None
         self.execs  = None
         self.events = None
         min_execs   = None
         for i in self:
            i.revise( blocks )
            perf_info.revise( self, i )
            self.clks   = add_or_none( self.clks, i.clks )
            self.weight = add_or_none( self.weight, i.weight )
            self.execs  = max_or_none( self.execs, i.execs )
            self.events = add_or_none( self.events, i.events )
            min_execs   = min_or_none( min_execs , i.execs )
            self.end    = i.start # "{:08x}".format( int( i.start, 16 ) + i.size )
         assert( self.execs - min_execs <= 1 )
      else:
         if blocks.distrib is not None and blocks.clks is not None:
            for i in self: i.revise( blocks )
         for r, e in self.prevs.items():
            e.target = rngs[ rng(r) ]
            e.target[-1].first = min_or_none( e.target[-1].first, sum_or_none( e.first, -1 ) )
            e.target[-1].last  = max_or_none( e.target[-1].last , sum_or_none( e.last , -1 ) )
            super( block, e.target ).revise( e.target[-1] )
            if e.first is None and e == e.target[-1].execs: e.first = sum_or_none( e.target[-1].first, +1 )
            if e.last  is None and e == e.target[-1].execs: e.last  = sum_or_none( e.target[-1].last , +1 )
            if self.start not in e.target.nexts: e.target.nexts[ self.start ] = edge( e, self, sum_or_none( e.first, -1 ), sum_or_none( e.last, -1 ) )
            else:
               e.target.nexts[self.start].first = min_or_none( e.target.nexts[self.start].first, sum_or_none( e.first, -1 ) )
               e.target.nexts[self.start].last  = max_or_none( e.target.nexts[self.start].last , sum_or_none( e.last , -1 ) )
         for r, e in self.nexts.items():
            e.target = rngs[ rng(r) ]
            e.target[0].first = min_or_none( e.target[0].first, sum_or_none( e.first, +1 ) )
            e.target[0].last  = max_or_none( e.target[0].last , sum_or_none( e.last , +1 ) )
            super( block, e.target ).revise( e.target[0] )
            if e.first is None and e == e.target[0].execs: e.first = sum_or_none( e.target[0].first, -1 )
            if e.last  is None and e == e.target[0].execs: e.last  = sum_or_none( e.target[0].last , -1 )
            if self.end not in e.target.prevs: e.target.prevs[ self.end ] = edge( e, self, sum_or_none( e.first, +1 ), sum_or_none( e.last, +1 ) )
            else:
               e.target.prevs[self.end].first = min_or_none( e.target.prevs[self.end].first, sum_or_none( e.first, +1 ) )
               e.target.prevs[self.end].last  = max_or_none( e.target.prevs[self.end].last , sum_or_none( e.last , +1 ) )
         if 'call' in self[-1].cmd.lower():
            num_addr = int(self[-1].start,16) + self[-1].size
            str_addr = "{:x}".format( num_addr )
            self.ret = rngs[rng(str_addr)]
            if int(self.ret.start,16) != num_addr: self.ret = None
         if self.ret is not None: self.ret.call = self

   def __eq__( self, other ): return self is other

   def __gt__( self, other ): return self.weight > other.weight

   def __hash__( self ): return self.dgst

   def print( self, blocks ):
      print( "\t; n=", self.execs, sep='', end='' )
      rate = float( self.execs ) * len( self ) / blocks.execs
      if rate >= 0.001: print( "[{:.1%}]".format( rate ), end='' )
      print( ", ", end='' )
      if self.clks is not None:
         print( "c=", self.clks, sep='', end='' )
         rate = self.clks / blocks.clks
         if rate >= 0.001: print( "({:.1%})".format( rate ), end='' )
         print( ", ", end='' )
         if self.clks > 0: print( "cpi={:.1f}, ipc={:.1f}, ".format( self.clks / self.execs / len( self ), float( self.execs ) * len( self ) / self.clks ), end='' )
      if self.events is not None:
         rate = self.events / self.execs / len(self)
         if rate >= 0.1: print( "epi={:.1f}, ".format( rate ), end='' )
      perf_info.print( self )
      for i in self: print( i.text, end='' )

   def export( self, cpi_scale=1.0 ):
      for i in self:
         print( '{} {}'.format( i.start, i.execs ), end='' )
         if i.clks is not None: print( ' {}'.format( i.clks if cpi_scale == 1.0 else int(0.5+cpi_scale*i.clks) ), end='' )
         print()


def min_or_none( a, b ): return a if b is None else ( b if a is None else min( a, b ) )
def max_or_none( a, b ): return a if b is None else ( b if a is None else max( a, b ) )
def add_or_none( a, b ): return a if b is None else ( b if a is None else a + b )
def mul_or_none( a, b ): return a * b if a is not None and b is not None else None
def div_or_none( a, b ): return a / b if a is not None and b is not None else None
def sum_or_none( a, b ): return a + b if a is not None and b is not None else None
def round_or_none( a ): return int(a+0.5) if a is not None else None


class event( list, perf_info ):
   def __init__( self ):
      perf_info.__init__( self )
      self.execs = None
      self.clks  = None
   def revise( self, perf ):
      perf_info.revise( self, perf )
      self.execs = add_or_none( self.execs, perf.execs )
      self.clks  = add_or_none( self.clks , perf.clks  )
   def add( self, perf ):
      self.revise( perf )
      self.append( perf )


def insert( text, pos, new ): return text[:pos] + new + text[pos:]


re_cmd  = extract( r"\s*0x[\da-z]+\s*:\s*(\S+)\s+", re.I )
re_code = extract( r"b\s*=\s*0x([\da-z]+)", re.I )
re_0clk = re.compile( r'(,\s*)?(cpi|ipc|c)=\d+(\.\d+)?', re.I )

class instr( defaultdict, perf_info ):
   def __init__( self, start, line ):
      defaultdict.__init__( self, event )
      perf_info.__init__( self, line )
      self.start  = start
      self.cmd    = re_cmd.extr_str( line )
      self.code   = re_code.find_str( line )
      assert self.code is None or len(self.code) & 1 == 0
      self.size   = len(self.code) // 2 if self.code is not None else None
      self.text   = ''

   def __eq__( self, other ): return self is other

   def revise( self, blocks, evt=None ):
      if evt is not None: perf_info.revise( self, evt )
      elif blocks.clks is not None: # second pass
         e1l   = self.text.find("\n")
         line1 = self.text[:e1l]
         lines = self.text[e1l:]
         ratio = float(self.clks)/blocks.clks if self.clks is not None and blocks.clks is not None else 0.0
         newl1 = re_clkp.chng_str( line1, ' c={}({:.1%})'.format( self.clks, ratio ) if self.clks is not None else 'c=inf' )
         if newl1 is not None: line1 = newl1
         else: line1 = re_clks.repl_str( line1, ' c={}({:.1%})'.format( self.clks, ratio ) if ratio >= 0.01 else ' c={}'.format( self.clks ) if self.clks is not None else 'c=inf' )
         self.text = line1+lines
      else:
         if blocks.distrib is not None:
            addr = int(self.start,16)
            idx  = bisect( blocks.distrib, ( addr, ) )
            if idx == len(blocks.distrib): rec = None
            else:
               rec = blocks.distrib[idx]
               if rec[0] != addr: rec = None
            e1l   = self.text.find("\n")
            line1 = self.text[:e1l]
            lines = self.text[e1l:]
            if rec is None:
               self.clks   = int(0.5+blocks.avg_cpi*self.execs)
               self.weight = self.clks
               line1 = re_0clk.sub( '', line1 )
            else:
               self.clks   = rec[2] if rec[1] == self.execs else int(0.5+float(rec[2])/rec[1]*self.execs) if rec[1] != 0 else None
               self.weight = self.clks
               line1 = re_cpi.repl_str( line1, ' cpi={:.1f}'.format( float(rec[2])/rec[1] ) if rec[1] != 0 else 'cpi=inf' )
               line1 = re_ipc.repl_str( line1, ' ipc={:.1f}'.format( float(rec[1])/rec[2] ) if rec[2] != 0 else 'ipc=inf' )
            self.text = line1+lines
         if len(self) == 0: self.events = None
         else:
            self.events = sum( e.execs for e in self.values() )
            rate = self.events / self.execs
            if rate > 0.1: self.text = insert( self.text, self.text.find( "\n" ), ", epi={:.1f}".format( rate ) )
         for r in ( ( "dcu", ( "dcu_miss", "l1d_m" ) ), ( "l2", ( "l2_miss", "l2_m" ) ), ( "dtlb", ( "dtlb_miss", "dtlb_m" ) ), ( "stl", ( "l1d_stl_known", "l1d_stl_unkwn", "dpf" ) ) ):
            rate = sum( self[e].execs for e in r[1] if e in self ) / self.execs
            if rate > 0.1: self.text = insert( self.text, self.text.find( "\n" ), ", {}={:.1%}".format( r[0], rate ) )


class group( list, perf_info ):
   def __init__( self, start ):
      perf_info.__init__( self )
      self.start  = start
      self.callee = {}


class func( list, perf_info ):
   def __init__( self, start, name, module ):
      perf_info.__init__( self )
      self.start  = start
      self.name   = name
      self.module = module
      self.instrs = []
   def revise( self ):
      self.clks  = None
      self.instn = None
      for b in self:
         perf_info.revise( self, b )
         self.clks  = add_or_none( self.clks , b.clks )
         self.instn = add_or_none( self.instn, b.execs * len( b ) )
      self.weight = self.clks if self.clks is not None else self.instn
   def __lt__( self, other ): return self.weight < other.weight


class funcs( dict, perf_info ):
   def __init__( self ):
      perf_info.__init__( self )
      self.modules = dict()
      self.groups  = []
   def set_group( self, start ):
      self.groups.append( group( start ) )
      return self.groups[-1]
   def new( self, start, name, module ):
      self.groups[-1].append( func( start, name, module ) )
      if name is not None: self.modules.setdefault( name, {} )[ module if module is not None else '' ] = self.groups[-1][-1]
      return self.setdefault( start, self.groups[-1][-1] )
   def revise( self ):
      self.clks   = None
      self.weight = None
      for f in self.values():
         f.revise()
         perf_info.revise( self, f )
         self.clks   = add_or_none( self.clks  , f.clks   )
         self.weight = add_or_none( self.weight, f.weight )


class rng:
   def __init__( self, start, end=None ): self.start = int(start,16);self.end = int(end,16) if end is not None else self.start;assert(self.start<=self.end)
   def __lt__( self, other ): return self.start < other.start
   def __str__( self ): return '{} {}'.format( self.start, self.end )


class ranges( list ):
   def __init__( self ): self.list_mode = False
   def __enter__( self ): self.list_mode = True; return self
   def __exit__( self, exc_type, exc_value, traceback ): self.list_mode = False
   def __getitem__( self, key ):
      if self.list_mode: return list.__getitem__( self, key )
      else:
         with self as lst:
            idx = bisect( lst, ( key, ) )
            return list.__getitem__( lst, idx-1 )[1] if idx == len(lst) or key < list.__getitem__( lst, idx )[0] else list.__getitem__( lst, idx )[1]
   def __setitem__( self, key, value ):
      if self.list_mode: list.__setitem__( self, key, value )
      else:
         with self as lst:
            if not lst or list.__getitem__( lst, -1 )[0] < key: lst.append( ( key, value ) )
            else: insort( lst, ( key, value ) )


class triad( perf_info ):
   def __init__( self, prev, blck, next ):
      perf_info.__init__( self )
      self.clks, self.execs, self.weight, self.prev, self.blck, self.next = None, None, None, prev, blck, next
      self.revise()
   def __lt__( self, other ): return self.weight < other.weight
   def revise( self ):
      for b in ( self.prev, self.blck, self.next ):
         if b is not None:
            self.weight = add_or_none( self.weight, b.weight )
            self.clks   = add_or_none( self.clks  , b.clks   )
            self.execs  = add_or_none( self.execs , float( b.execs ) * len( b ) )
            perf_info.revise( self, b )
   def print( self, blocks ):
      print( "\t; ", end='' )
      rate = float( self.execs ) / blocks.execs
      if rate >= 0.001: print( "n=[{:.1%}], ".format( rate ), end='' )
      if self.clks is not None:
         print( "c=", self.clks, sep='', end='' )
         rate = self.clks / blocks.clks
         if rate >= 0.001: print( "({:.1%})".format( rate ), end='' )
         print( ", cpi={:.1f}, ipc={:.1f}, ".format( self.clks / self.execs, float( self.execs ) / self.clks ), end='' )
      perf_info.print( self )
      if self.prev is not None: print( ";### Block A                    ", end='' ), self.prev.print( blocks )
      if self.blck is not None: print( ";### Block B                    ", end='' ), self.blck.print( blocks )
      if self.next is not None: print( ";### Block C                    ", end='' ), self.next.print( blocks )


class triads( list ):
   def append( self, tr ): insort( self, tr )


class variant( float ):
   def __new__( cls, val1, val2 ): return float.__new__( cls, val1 if val1 == val2 else ( val1 + val2 ) / 2 )
   def __init__( self, val1, val2 ): self.min, self.max = min( val1, val2 ), max( val1, val2 )
   def __bool__( self ): return self.min != self.max
   def __format__( self, format_spec ): return '~' + float.__format__( self, format_spec ) if self else float.__format__( self, format_spec )


class path( list, perf_info ):
   max_depth = sys.getrecursionlimit()

   def __init__( self, **kwargs ):
      perf_info.__init__( self )
      self.ret_alwd = True
      self.ret_excl = None
      self.pcl_alwd = False
      self.prt_alwd = False
      self.find_path_set( **kwargs )
      
   def __enter__( self ): self.depth += 1;return self
   
   def __exit__( self, exc_type, exc_value, traceback ): self.depth -= 1
   
   def find_seq_path( self, cur_path ):
      with self:
         bst_path,max_edge,thr_edge,eout_alwd = None,None,0.0,True
         for e in cur_path[-1].nexts.values():
            if e.target is cur_path[0]: bst_path,max_edge,thr_edge = cur_path,e,0.99*e
            elif len(e.target.nexts) == 1 and next(iter(e.target.nexts.values())).target is cur_path[0]: eout_alwd = False
         if bst_path is not None and eout_alwd: return bst_path
         if self.depth < path.max_depth-50:
            reject = self.stops.union( self.term ).union( cur_path )
            for e in sorted( cur_path[-1].nexts.values(), reverse=True ):
               if e > thr_edge:
                  new_path = None
                  if e.target in reject:
                     if cur_path[-1].ret is None and any( c.target.ret is not None for c in e.target.prevs.values() ) and e.target not in self.term and e.target not in self.stops:
                        sub_path,depth = self.ends.get( cur_path[-1], ( None, path.max_depth ) )
                        if sub_path is None and depth == path.max_depth:
                           self.ends[ cur_path[-1] ] = ( None, self.depth )
                           sub_path = self.find_sub_path( ( cur_path[0], cur_path[-1] ), term=copy(self.term), ret_alwd=True, prt_alwd=True )
                           self.ends[ cur_path[-1] ] = ( sub_path, self.depth )
                        if sub_path is not None: new_path = cur_path + sub_path[2:]   
                  else:
                     new_path = self.find_path( cur_path + ( e.target, ) )
                  if new_path is not None:
                     if max_edge is None: bst_path,max_edge,thr_edge = new_path,e,0.99*e
                     elif e > thr_edge:
                        f0   = min_or_none( max_edge.first, max_edge.last )
                        l0   = max_or_none( max_edge.first, max_edge.last )
                        f1   = min_or_none(        e.first,        e.last )
                        l1   = max_or_none(        e.first,        e.last )
                        diff = +1
                        if f0 is not None:
                           if f1 is None: diff = -1
                           else:
                              diff = (l1-f1) - (l0-f0)
                              if diff == 0: diff = f0 - f1
                        else:
                           if f1 is None: diff = f0 - f1
                        if diff == 0: diff = e - max_edge
                        if diff > 0: bst_path,max_edge,thr_edge = new_path,e,0.99*e
                  else: self.term.add( e.target )
         return bst_path

   def find_sub_path( self, cur_path, **kwargs ):
      with self:
         save_vars = ( self.term, self.ret_alwd, self.ret_excl, self.pcl_alwd, self.prt_alwd )
         self.ret_alwd,self.ret_excl,pcl_alwd,prt_alwd=False,None,False,False
         self.find_path_set( **kwargs )
         cur_path  = self.find_seq_path( cur_path )
         self.term,self.ret_alwd,self.ret_excl,self.pcl_alwd,self.prt_alwd = save_vars
         return cur_path

   def process_calls( self, cur_path ):
      with self:
         while cur_path[-1].ret is not None:
            if self.pcl_alwd:
               sub_path = self.find_sub_path( cur_path, term=copy(self.term), pcl_alwd=True )
               if sub_path is not None: return sub_path
            if cur_path[-1].ret in self.term: return None
            sub_path,depth = self.fncs.get( cur_path[-1], ( None, path.max_depth ) )
            if sub_path is None and depth == path.max_depth:
               self.fncs[ cur_path[-1] ] = ( None, self.depth )
               sub_path = self.find_sub_path( ( cur_path[-1].ret, cur_path[-1] ), term=set(), ret_excl=cur_path[-1].ret )
               self.fncs[ cur_path[-1] ] = ( sub_path, self.depth )
            if sub_path is None: return None
            if sub_path[0] is cur_path[0]: return cur_path + sub_path[2:]
            cur_path = cur_path + sub_path[2:] + sub_path[0:1]
         return cur_path

   def find_path( self, cur_path ):
      if clock() >= self.time: return None
      with self:
         if cur_path[-1].call is not None and ( len(cur_path) == 1 or 'ret' in cur_path[-2][-1].cmd.lower() ):
            if self.ret_alwd and cur_path[-1] is self.ret_excl or not self.ret_alwd and cur_path[-1] is not self.ret_excl: return None
            sub_path,depth = self.ends.get( cur_path[-1].call, ( None, path.max_depth ) )
            if sub_path is None and depth == path.max_depth:
               self.ends[ cur_path[-1].call ] = ( None, self.depth )
               sub_path = self.find_sub_path( ( cur_path[0], cur_path[-1].call ), term=copy(self.term) )
               self.ends[ cur_path[-1].call ] = ( sub_path, self.depth )
            if sub_path is not None:
               cur_path = self.process_calls( cur_path )
               if cur_path is not None:
                  mid_path = self.find_sub_path( ( sub_path[1], cur_path[-1] ), term=copy(self.term) )
                  if mid_path is not None:
                     sub_path = mid_path[2:] + sub_path[1:]
                     self.ends[ cur_path[-1] ] = ( sub_path, self.depth )
                     return cur_path + sub_path
               else: return None
            if not self.prt_alwd: return None
         cur_path = self.process_calls( cur_path )
         return self.find_seq_path( cur_path ) if cur_path is not None else None
      
   def find_path_set( self, **kwargs ):
      self.reset = False
      for var,val in kwargs.items(): vars(self)[var] = val
      if self.reset: self.ends,self.fncs,self.term = dict(),dict(),set()
 
   def find_path_run( self, cur_path, **kwargs ):
      self.depth = 0
      self.find_path_set( **kwargs )
      depth      = self.depth
      self.time  = clock() + 2*60
      cur_path   = self.find_path( cur_path )
      assert self.depth == depth
      return cur_path
 
   def body( self ):
      depth, prev = 0, None
      for b in self:
         if prev is not None:
           if prev.ret is not None: depth += 1
           if b.call is not None and 'ret' in prev[-1].cmd.lower(): depth -= 1
         if depth == 0: yield b
         if depth < 0: break
         prev = b
 
   def revise_single( self, blocks ):
      self.revise( blocks )
 
      weight      = self.weight
      self.weight = None
      self.clks   = None
      self.execs  = None

      inside = frozenset(self)
      prev   = None
      depth  = 0
      base   = 0
      stat   = []
      idxs   = [ ( dict(), dict() ) ]
      for i,b in enumerate(self):
         if prev is not None:
            if prev.ret is not None: depth += 1;idxs.append( ( dict(), dict() ) )
            if b.call is not None and 'ret' in prev[-1].cmd.lower(): depth -= 1;idxs.pop()
         if depth < base:
            assert( len(idxs) == 0 )
            base = depth
            idxs.append( ( dict(), dict() ) )
 
         idxs[-1][0][b] = len(stat)
         stat.append( [ len(b),div_or_none(b.weight,b.execs),div_or_none(b.clks,b.execs) ] )
         self.execs  = add_or_none( self.execs , stat[-1][0] )
         self.weight = add_or_none( self.weight, stat[-1][1] )
         self.clks   = add_or_none( self.clks  , stat[-1][2] )
         print(i,':',depth,b.execs,stat[-1])
         for e in b.prevs.values():
            if e.target not in inside: print('loose enter:',b.start,'<-',e.target.start)
            if len(e.target.nexts) == 1 and ( b.call is None or 'ret' not in e.target[-1].cmd.lower() ) and e.target.ret is None and e.target not in inside:
               print('suspected large loops start:',b.start,'<-',e.target.start,e)
               idxs[-1][1][e] = i
         exits = 0
         for e in b.nexts.values():
            if e.target.call is None or 'ret' not in b[-1].cmd.lower():
               l = idxs[-1][0].get(e.target)
               if l is None and e.target not in inside:
                  s = idxs[-1][1].get(e)
                  if s is not None: l = s
               if l is not None:
                  if exits == 0:
                     for n in b.nexts.values():
                        if n.target not in idxs[-1][0] and n.target in inside: exits += n
                  if exits == 0: continue
                  tc = float(e)/exits
                  print("'->",l,e,tc,exits)
                  execs = self.execs
                  for j in range(l,len(stat)):
                     extra = tuple( mul_or_none( stat[j][k], tc ) for k in range(3) )
                     for k in range(3): stat[j][k] = add_or_none( stat[j][k], extra[k] )
                     self.execs  = add_or_none( self.execs , extra[0] )
                     self.weight = add_or_none( self.weight, extra[1] )
                     self.clks   = add_or_none( self.clks  , extra[2] )
                  print('   ',self.execs,self.execs-execs,min(n.first for n in b.nexts.values() if n.target is self[i+1])-b[-1].first,self[l][0].last-min(p.last for p in self[l].prevs.values() if p.target is self[l-1]))
            if e.target not in inside: print('loose exit:',b.start,'->',e.target.start)
         prev = b
 
      self.instn = div_or_none( weight, self.weight )
      if self.instn is None: self.instn = 1
      self.execs  = mul_or_none( self.execs , self.instn )
      self.weight = mul_or_none( self.weight, self.instn )
      self.clks   = mul_or_none( self.clks  , self.instn )
       
         
   def revise( self, blocks ):
      for b in self.body():
         perf_info.revise( self, b )

      depth, base, prev, stack = 0, 0, None, []
      for b in self:
         if prev is None:
            stack.append( [ None, b.execs, None, None, None, b.execs ] )
         else:
            if b.call is not None and 'ret' in prev[-1].cmd.lower():
               if depth <= base:
                  depth -= 1
                  stack[-1][0] = max_or_none( stack[-1][0], b.execs    )
                  stack[-1][1] = max_or_none( stack[-1][1], prev.execs )
                  stack.append( [ prev.execs, b.execs, None, None, None, b.execs ] )
            if prev.ret is not None:
               if depth < base: base = depth
               depth += 1
               stack.append( [ prev.execs, b.execs, None, None, None, b.execs ] )

         stack[-1][2] = add_or_none( stack[-1][2], b.weight         )
         stack[-1][3] = add_or_none( stack[-1][3], b.clks           )
         stack[-1][4] = add_or_none( stack[-1][4], b.execs * len(b) )
         stack[-1][5] = b.execs

         if b.call is not None and prev is not None and 'ret' in prev[-1].cmd.lower():
            if depth > base:
               depth -= 1
               stack[-1][1] = max_or_none( stack[-1][1], prev.execs )
               for i in range(2,5): stack[-2][i] = add_or_none( stack[-2][i], div_or_none( mul_or_none( stack[-1][i], stack[-1][0] ), stack[-1][1] ) )
               stack[-2][5] = div_or_none( mul_or_none( stack[-1][5], stack[-1][0] ), stack[-1][1] )
               stack.pop()
         prev = b
      while len(stack) > 1:
         for i in range(2,5): stack[-2][i] = add_or_none( stack[-2][i], div_or_none( mul_or_none( stack[-1][i], stack[-1][0] ), stack[-1][1] ) )
         stack[-2][5] = div_or_none( mul_or_none( stack[-1][5], stack[-1][0] ), stack[-1][1] )
         stack.pop()
      if stack[-1][0] is None: stack[-1][0] = stack[-1][1]
      self.instn  = stack[-1][0]
      self.weight = round_or_none( div_or_none( mul_or_none( stack[-1][2], stack[-1][0] ), stack[-1][1] ) )
      self.clks   = round_or_none( div_or_none( mul_or_none( stack[-1][3], stack[-1][0] ), stack[-1][1] ) )
      self.execs  = round_or_none( div_or_none( mul_or_none( stack[-1][4], stack[-1][0] ), stack[-1][1] ) )

   def print( self, blocks, hdr_rqrd=False ):
      if hdr_rqrd:
         print( "\t; n={:.1f}, ipp={:.1f}, ".format( self.instn, float(self.execs)/self.instn ), end='' )
         if self.clks is not None and self.clks > 0:
            print( "cpp={:.1f}, ".format( float(self.clks)/self.instn ), end='' )
         print( "i={:.0f}".format( self.execs ), end='' )
         rate = float( self.execs ) / blocks.execs
         if rate >= 0.001: print( "({:.1%})".format( rate ), end='' )
         print( ", ", sep='', end='' )
         if self.clks is not None and self.clks > 0: 
            print( "c={:.0f}".format( self.clks ), end='' )
            rate = self.clks / blocks.clks
            if rate >= 0.001: print( "({:.1%})".format( rate ), end='' )
            print( ", cpi={:.1f}, ipc={:.1f}, ".format( self.clks / self.execs, float( self.execs ) / self.clks ), end='' )
         perf_info.print( self )
      for i, b in enumerate( self ): print( ";### Block {:2}                   ".format( i ), end='' ), b.print( blocks )
 
 
class loop( path ):
   def __init__( self, triad ):
      path.__init__( self )
      self.prevs = ()
      self.nexts = ( max_or_none( triad.prev, max_or_none( triad.blck, triad.next ) ), )

   def are_another_exits( self, blk ):
      if len(self) == 1 or blk in self or blk in self.exits: return []
      term      = set( self.exits )
      exits     = []
      outside   = None
      save_vars = ( self.ret_alwd, self.ret_excl )
      self.find_path_set( ret_alwd=False, ret_excl=None )
      while True:
         path = self.find_path_run( ( blk, self[0] ), term=term.union(exits) )
         if path is None: break
         if outside is None:
            mexit = self.find_path_run( ( blk, self[-1] ), term=set( ( self[0], ) ) )
            if mexit is not None: outside = frozenset( mexit )
         if outside is None: exits.append( blk ); break
         lst_inside, nxt_outside = self[0], None
         for b in path[2:]:
            if nxt_outside is None: nxt_outside = b
            if b not in outside: lst_inside, nxt_outside = b, None
         exits.append( lst_inside )
         if nxt_outside is not None: term.add( nxt_outside )
      self.ret_alwd, self.ret_excl = save_vars
      return exits

   def set_first( self, idx, blocks ):
      if idx != 0:
         tmp = self[:idx]
         del self[:idx]
         self.extend( tmp )
      
      self.exits = [ self[-1] ]
      for e in self[0].prevs.values(): self.exits.extend( self.are_another_exits( e.target ) )
      self.inside = frozenset(self).union(self.exits)

      self.prevs = tuple( e.target for e in self[0].prevs.values()  if e.target not in self.inside )
      self.nexts = tuple( e.target for e in self[-1].nexts.values() if e.target not in self.inside )

      self.revise( blocks )

   def grow( self, blocks ):
      if self.nexts: self.append( max( self.nexts ) )
      self[:] = self.find_path_run( tuple( self ), stops=frozenset( b for l in blocks.loops for b in l ), reset=True )
      start, min_enters, min_exits, max_tc = None, None, None, None
      for idx, b in enumerate( self ):
         cur_enters = sum( e for e in b.prevs.values()           if e.target not in self )
         cur_exits  = sum( e for e in self[idx-1].nexts.values() if e.target not in self )
         cur_tc = ( float( max( b.execs, self[idx-1].execs ) )
                    / ( abs( cur_enters - cur_exits ) if cur_enters != cur_exits else 1 )
                    / ( abs( b.execs - self[idx-1].execs ) if b.execs != self[idx-1].execs else 1 )
                    / self.count( b ) )
         if self[idx-1].ret is not None or b.call is not None or int( b.start, 16 ) > int( self[idx-1].start, 16 ): cur_enters, cur_exits = 0, 0
         if ( min_enters is None
            or min_enters + min_exits == 0 and ( cur_enters + cur_exits != 0 or cur_tc > max_tc or cur_tc == max_tc and int( b.start, 16 ) < int( self[start].start, 16 ) )
            or cur_enters != 0 and cur_exits != 0
               and ( min_enters == 0 or min_exits == 0 or cur_tc > max_tc or cur_tc == max_tc and int( b.start, 16 ) < int( self[start].start, 16 ) )
            or ( cur_enters == 0 or cur_exits == 0 ) and cur_enters + cur_exits != 0 and ( min_enters == 0 or min_exits == 0 )
               and ( cur_tc > max_tc or cur_tc == max_tc and int( b.start, 16 ) < int( self[start].start, 16 ) ) ):
            start, min_enters, min_exits, max_tc = idx, cur_enters, cur_exits, cur_tc
      self.set_first( start, blocks )

   def find_outter( self, cur_path ):
      if clock() >= self.time: return None
      with self:
         if self.depth >= path.max_depth - 50: return None
         if len(vars(cur_path[-1].target)[self.sel]) == 1:
            t = next(iter(vars(cur_path[-1].target)[self.sel].values())).target
            if t in self.term: return cur_path[-1].first if ( len(cur_path) <= 2 or len(vars(cur_path[-2].target)[self.sel]) == 1 ) and t in self else None
            if t in self: return cur_path[-1].first
         max_e = None
         max_f = None
         min_e = None
         seen_body = False
         for e in ( e for e in vars(cur_path[-1].target)[self.sel].values() if e.first is not None and
                     self.opt( e.first, cur_path[-1].first ) == e.first and
                     e.target not in self.term ):
            if e.target in self:
               seen_body = True
            elif len(cur_path) == 1 or abs( e.first - cur_path[-1].first ) == len(cur_path[-1].target):
               for t in reversed(cur_path):
                  if t.target is e.target: return None
               first = self.find_outter(cur_path+(e,)) if e.target.call is None and 'ret' not in e.target[-1].cmd.lower() else None
               if first == e.first: seen_body = True
               else:
                  if max_e is None or e.first != max_e.first and self.opt( e.first, max_e.first ) == max_e.first: max_e,max_f = e,first
            else:
               if min_e is None or e.first != min_e.first and self.opt( e.first, min_e.first ) == min_e.first: min_e = e
         if not seen_body:
            return max_e.first if max_e is not None else min_e.first if min_e is not None else None
         else:
            if max_e is None:
               if min_e is not None: return min_e.first
               else: return cur_path[-1].first
            else:
               return max_f if max_f is not None else max_e.first

   def revise( self, blocks ):
      path.revise( self, blocks )

      revr = []

      self.find_path_set( sel='nexts', opt=max_or_none, exit=None, depth=0, term=set() )
      max_fst   = None
      self.time = clock() + 2*60
      for b in self.body():
         self.term.add(b)
         first = sum_or_none( self.find_outter((edge(b.execs,b,b.first-1),)), +1 )
         if first != b.first: self.exit = min_or_none( self.exit, first )
         max_fst = max_or_none( max_fst, max( i.first for i in b if i.first is not None  ) )
         revr.append(b)
      assert self.depth == 0
      if self.exit is None: self.exit = min(blocks.last,max_fst+1)

      self.find_path_set( sel='prevs', opt=min_or_none, enter=None, term=set() )
      min_fst   = None
      self.time = clock() + 2 * 60
      for b in reversed(revr):
         self.term.add(b)
         first = sum_or_none( self.find_outter((edge(b.execs,b,b.first+1),)), -1 )
         if first != b.first: self.enter = max_or_none( self.enter, first )
         min_fst = min_or_none( min_fst, min( i.first for i in b if i.first is not None  ) )
      assert self.depth == 0
      if self.enter is None: self.enter = max(blocks.first,min_fst-1)

      if self.enter >= self.exit: self.enter = max(blocks.first,min_fst-1)

      eplg = sum( e for b in self.exits for e in b.nexts.values() if e.target not in self.inside )
      prlg = sum( e for e in self[0].prevs.values() if e.target not in self.inside and ( eplg != 0 or int( e.target.start, 16 ) < int( self[0].start, 16 ) ) )
      fst  = self[0][0].execs
      lst  = sum( b[-1].execs for b in self.exits ) - sum( e for b in self.exits for e in b.nexts.values() if e.target in self.inside and e.target is not self[0] )
      no_prlg, no_eplg = prlg == 0, eplg == 0
      if self.first < min( tuple( e.target.first for e in self[0].prevs.values() if e.target not in self.inside ) + ( blocks.last + 1, ) ):
         prlg += 1
         if self[0][0].first is None or self[0][0].first != self.first: fst += 1
      if self.last > max( tuple( e.target.last for e in self[-1].nexts.values() if e.target not in self.inside ) + ( blocks.first - 1, ) ):
         eplg += 1
         if self[-1][-1].last is None or self[-1][-1].last != self.last: lst += 1
      if no_prlg and lst != 0 or fst == 0: prlg, fst = eplg, lst
      if no_eplg and fst != 0 or lst == 0: eplg, lst = prlg, fst

      w, ref_start = 0, 1
      for i in ( i for b in self.body() for i in b ):
         if ref_start > 0 and i.weight is not None: w += i.weight
         if i.first is not None and i.first == self.first:
            ref_start -= 1
         if i.last is not None and i.last == self.last:
            ref_start += 1
      if float(w) / self.weight >= 0.5 and fst > 1 and lst > 1: fst -=1; lst -= 1

      self.tc    = variant( fst / prlg, lst / eplg )
      self.instn = variant( prlg, eplg )
      self.ipi   = self.execs / self.instn / self.tc

   def print( self, blocks ):
      print( "\t; n={:.0f}".format( self.instn ), end='' )
      rate = float( self.execs ) / blocks.execs
      if rate >= 0.001: print( "[{:.1%}]".format( rate ), end='' )
      print( ", ", sep='', end='' )
      if self.clks is not None: print( "cpl={:.1f}, ".format( self.clks / self.instn ), end='' )
      print( "tc={:.1f}, ipi={:.1f}, ".format( self.tc, self.ipi ), end='' )
      if self.clks is not None:
         print( "c=", self.clks, sep='', end='' )
         rate = self.clks / blocks.clks
         if rate >= 0.001: print( "({:.1%})".format( rate ), end='' )
         print( ", cpi={:.1f}, ipc={:.1f}, ".format( self.clks / self.execs, float( self.execs ) / self.clks ), end='' )
      perf_info.print( self )
      if self.prevs: print( ";### Prolog                     ", end='' ), max( self.prevs ).print( blocks )
      path.print( self, blocks )
      if self.nexts: print( ";### Epilog                     ", end='' ), max( self.nexts ).print( blocks )

   def export( self, blocks ):
      print( "0x{}:{}:{}:{}:{}".format( self[0][0].start, self.instn, float(self.execs) / blocks.execs, self.tc, self.ipi ), end='' )
      if self.clks is not None: print( ":{}:{}".format( self.clks / self.instn, self.clks / blocks.clks ), end='' )
      print( '' )


class blocks( list, perf_info ):
   def __init__( self ):
      perf_info.__init__( self )
      self.distrib = None
      self.avg_cpi = 0.0

   def new( self, instr, func ):
      self.append( block( instr ) )
      func.append( self[-1] )
      return self[-1]

   def revise( self, scope, loops=None ):
      self.sort( key=lambda b:int(b.start,16) )
      if self.distrib is not None:
         dinst = 0
         dclks = 0
         for rec in self.distrib:
            dinst += rec[1]
            dclks += rec[2]
         self.avg_cpi = float(dclks)/float(dinst)
      stats = [None,None,None]
      self.weight,self.clks,self.execs = stats
      self.rngs = ranges()
      for b in self:
         b.revise( self )
         perf_info.revise( self, b )
         stats[0] = add_or_none( stats[0], b.weight       )
         stats[1] = add_or_none( stats[1], b.clks         )
         stats[2] = add_or_none( stats[2], b.execs*len(b) )
         self.rngs[rng(b.start,b.end)] = b
      self.weight,self.clks,self.execs = stats
      for b in self: b.revise( self, self.rngs )
      self.triads = triads()
      for b in scope:
         prev, next = None, None
         for t in ( self.rngs[ rng(r) ] for r in b.nexts ):
            if t is not b and ( next is None or t > next ): next = t
         for t in ( self.rngs[ rng(r) ] for r in b.prevs ):
            if t is not b and t is not next and ( prev is None or t > prev ): prev = t
         self.triads.append( triad( prev, b, next ) )
      self.loops = []
      fails = 0
      if loops is not None:
         ts = ( triad( None, self.rngs[ rng(r) ], None ) for r in loops )
      else:
         ts = reversed( self.triads )
      for t in ts:
         b = max_or_none( t.prev, max_or_none( t.blck, t.next ) )
         if all( b not in l for l in self.loops ):
            try:
               lp = loop( t )
               lp.grow( self )
               if len(lp) == 1 and len(lp[0]) == 3 and any( lp[0][0].cmd.lower().startswith( i ) for i in ( 'vgather', 'scatter' ) ): # KNC gather/scatter detected
                  up = copy( lp )
                  try:
                     up.grow( self )
                     lp = up
                  except: pass
               if loops is not None: lp.set_first( lp.index(t.blck), self ) # will raise exception if loop doesn't have this address, but it's ok
               self.loops.append( lp )
               fails = 0
               if not all_loops and len(self.loops) >= 3: break
            except TypeError:
               if self.loops:
                  fails += 1
                  if fails >= skp_loops: break
      self.loops.sort( key=lambda l: l.weight, reverse=True )


class instrs( list ):
   def new( self, start, line ):
     self.append( instr( start, line ) )
     return self[-1]


class thread:
   def __init__( self ):
      self.funcs  = funcs()
      self.blocks = blocks()
      self.instrs = instrs()
   def revise( self, func=None, loops=None ):
      fs = self.funcs.modules.get( func )
      if fs is not None:
         scope = []
         for f in fs.values(): scope.extend( f )
         self.blocks.revise( scope )
      else:
         self.blocks.revise( self.blocks, loops )
      self.funcs.revise()
      self.clks,self.execs,self.weight = self.blocks.clks,self.blocks.execs,self.blocks.weight
      

class threads( list ):
   def __init__( self ):
      self.focus = 0
   def revise( self, func, loops ):
      stats = [None,None,None]
      for t in threads:
         t.revise( func, loops if t is self[self.focus] else None )
         stats[0] = add_or_none( stats[0], t.weight )
         stats[1] = add_or_none( stats[1], t.clks   )
         stats[2] = add_or_none( stats[2], t.execs  )
      self.weight,self.clks,self.execs = stats
   

class loop_desc( list ):
   def __init__( self, module, func, weight ):
      self.module = module
      self.func   = func
      self.weight = weight


class addr:
   def __init__( self, lid, ofs, path=None ): self.lid = int(lid); self.ofs = ofs

   def revise( self, modules ):
      if self.lid >= len(modules): return
      t = modules[self.lid]
      if t is None: return
      self.bse,self.mdl = t
      self.vrt = hex(int(self.bse,0)+int(self.ofs,16))[2:]
 
   def __format__( self, format_spec ): return split(self.mdl)[1]+'+0x'+self.ofs

class region(list):
   def __init__( self, addrs ): self.extend(addrs)

   def revise( self, modules ):
      for a in self:
         a.revise( modules )
      

class regions(list):
   def __init__( self ): self.modules = []; self.bases = dict(); self.focus = None
 
   def add_base( self, base, path ):
      self.bases[path] = base
      name = split(path)[1]
      if name != path: self.bases[name] = base

   def add_modl( self, lid, path ):
      idx = int(lid)
      while( len(self.modules) <= idx ): self.modules.append( None )
      if self.modules[idx] is not None: return
      base = self.bases.get(path)
      if base is None: base = self.bases.get(split(path)[1])
      if base is None: return
      self.modules[idx] = ( base, path )

   def find_lid( self, path ):
      lid = None
      for m in enumerate(self.modules):
         if m[1] is not None and m[1][1] == path: lid = m[0];break
      if lid is None:
         name = split(path)[1]
         for m in enumerate(self.modules):
            if m[1] is not None and split(m[1][1])[1] == name: lid = m[0];break
      if lid is None:
         lid = len(self.modules)
         self.add_modl( lid, path )
      return lid

   def revise( self, addrs ):
      self.focus = region( () )
      for i in range(4):
         rec = addrs[i].split('+')
         lid = self.find_lid( rec[0] )
         if rec[1].startswith('0x'): rec[1] = rec[1][2:]
         self.focus.append( addr(lid,rec[1]) )
      assert( len(self.focus) == 4 )
      self.focus.revise( self.modules )
         
      for r in self:
         r.revise( self.modules )
         if all( self.focus[i].vrt == r[i].vrt for i in range(4) ): self.focus = r

regions = regions()


class symbol:
   def __init__( self, name, module ): self.name, self.module = name, module
   def __lt__( self, other ): return self.name < other.name


cur_module = None


def xml_elem( name, attrs ):
   global cur_module
   if name.lower() == 'module':
      cur_module = attrs.get('path')
      if cur_module is None: cur_module = attrs['name']
      if cur_module is not None:
         base = attrs['base']
         if base is not None: regions.add_base( base, cur_module )
   if name.lower() == 'symbol': symbols[ int( attrs['base'], 0 ) ] = symbol( attrs['name'], cur_module )
  

threads = threads()

if len(sys.argv) > 2 and sys.argv[2].isdecimal():
   threads.focus = int(sys.argv[2])
   del sys.argv[2]
if len(sys.argv) > 2 and sys.argv[2][-1] == '%':
   work_share = float(sys.argv[2][:-1]) / 100;
   del sys.argv[2]
else: work_share = 1
if len(sys.argv) > 2 and sys.argv[2] == '-warmup':
   need_warmup = True
   del sys.argv[2]
else: need_warmup = False
if len(sys.argv) > 3 and sys.argv[2] == '-loops':
   all_loops = True
   skp_loops = 10
   fcs_loops = None if sys.argv[3] == 'all' else () if sys.argv[3] == 'none' else sys.argv[3].split( ',' )
   del sys.argv[2:4]
else:
   all_loops = False
   skp_loops = 1
   fcs_loops = None
if len(sys.argv) > 2 and sys.argv[2] == '-not-phases':
   perf_info.phases_prvd = False
   del sys.argv[2]
if len(sys.argv) > 2 and sys.argv[2] == '-full-output':
   fout_rqrd = True
   del sys.argv[2]
else: fout_rqrd = False
if len(sys.argv) > 2 and sys.argv[2] == '-stats':
   stats_rqrd = True
   del sys.argv[2]
else: stats_rqrd = False
if len(sys.argv) > 2 and sys.argv[2] == '--wgt-lst':
   all_loops = True
   skp_loops = 10
   list_req  = True
   del sys.argv[2]
else: list_req  = False
if len(sys.argv) > 3 and sys.argv[2] == '-fnd-paths':
   reg_list = sys.argv[3]
   del sys.argv[2:4]
else: reg_list = None
if len(sys.argv) > 3 and sys.argv[2] == '-distrib':
   dtr_rqrd = True
   dtr_cpi  = None if sys.argv[3] == 'keep-cpi' else float(sys.argv[3])
   del sys.argv[2:4]
else: dtr_rqrd,dtr_cpi = False,None
if len(sys.argv) > 3 and sys.argv[2] == '-inherit':
   dtr_name = sys.argv[3]
   del sys.argv[2:4]
else: dtr_name = None
if len(sys.argv) > 2 and sys.argv[2] == '-export':
   need_export = True
   del sys.argv[2]
else: need_export = False
if len(sys.argv) > 2 and sys.argv[2] == '-split':
   need_split = True
   del sys.argv[2]
else: need_split = False


symbols = ranges()
if len(sys.argv) > 2:
   with open( sys.argv[2], 'rb' ) as f:
      if splitext( sys.argv[2] )[1] == '.gz': f = gzip.GzipFile( fileobj=f )
      p                     = xml.parsers.expat.ParserCreate()
      cur_module            = None
      p.StartElementHandler = xml_elem
      p.ParseFile( f )

func_name = sys.argv[3] if len(sys.argv) > 3 else None


if reg_list is not None:
   re_rsep = re.compile( r'\s*:\s*' )
   addrs   = re_rsep.split( reg_list )
   if len(addrs) > 3:
      if len(addrs) > 4:
         #for line in fileinput.input( addrs[4] ):
         #   if line.startswith( '5:' ):
         #      rec = re_rsep.split( line.strip() )
         #      regions.append( region( ( addr(rec[25],rec[26]), addr(rec[27],rec[28]), addr(rec[29],rec[30]), addr(rec[31],rec[32]) ) ) )
         #   elif line.startswith( '3:' ):
         #      rec = re_rsep.split( line.strip() )
         #      if rec[11].upper() == 'LOAD' and 'x' in rec[12]:
         #         regions.add_modl(rec[1],rec[13])
         for line in fileinput.input( addrs[4] ):
            rec = re_rsep.split( line.strip() )
            regions.append( region( ( addr(regions.find_lid(rec[2]),rec[3]), addr(regions.find_lid(rec[4]),rec[5]), addr(regions.find_lid(rec[6]),rec[7]), addr(regions.find_lid(rec[8]),rec[9]) ) ) )
      regions.revise( addrs )
   

try:
   cur_thread = thread()
   threads.append( cur_thread )
   cur_instr  = None
   nxt_block  = None
   fsn_list   = []
   prv_list   = []
   nxt_list   = []


   re_thrd = re.compile( r"\s*(processor|core|thread)\s+(\d+)\s*", re.I )
   re_func = re.compile( r"\s*function\s+0x0*([\da-z]+)\s*(\[\s*([^\s]+)\s*\(\s*(.*\/)*([^\+]+)\+0x[\da-z]+\s*\)\])?$", re.I )
   re_evnt = re.compile( r"events\s*=\s*\(\s*(.+)\s*\)|(next_ip|prev_ip)\s*=\s*0x0*([\da-z]+)(\(mispred\))?", re.I )
   re_clee = re.compile( r"\s*callee\s+0x0*([\da-z]+)\s*:\s*calls\s*=\s*(\d+)", re.I )
   re_addr = re.compile( r"\s*0x0*([\da-z]+)\s*:", re.I )
   re_case = re.compile( r"\s*case\s+(.+):", re.I )
   re_wrds = re.compile( r"\W+" )
   re_summ = re.compile( r"\s*function\s+0x0*([\da-z]+)\s*summary", re.I )

   max_cid, max_tid = 0, 0
   for line in fileinput.input( sys.argv[1] ):
      s = re_thrd.match( line )
      if s is not None:
         if s.group(1).lower() == "processor": pid = int( s.group(2) )
         elif s.group(1).lower() == "core": cid = int( s.group(2) )
         else:
            tid = int( s.group(2) )
            max_cid, max_tid = max( max_cid, cid ), max( max_tid, tid )
            idx = tid + ( cid + pid * ( max_cid + 1 ) ) * ( max_tid + 1 )
            for e in range( idx - len( threads ) + 1 ): threads.append( thread() )
            cur_thread = threads[idx]
            nxt_block  = None
      s = re_func.match( line )
      if s is not None:
         name   = s.group(3)
         module = s.group(5)
         if ( name is None or module is None ) and symbols:
            sym = symbols[ int( s.group(1), 16 ) ]
            if name   is None: name   = sym.name
            if module is None: module = sym.module
         cur_func = None
         fs = cur_thread.funcs.modules.get( name )
         if fs is not None:
            cur_func = fs.get( module if module is not None else '' )
         cur_group = cur_thread.funcs.set_group( s.group(1) )
         if cur_func is None: cur_func = cur_thread.funcs.new( s.group(1), name, module )
         else: cur_group.append( cur_func )

      s = re_addr.match( line )
      if s is not None:
         if symbols:
            sym = symbols[ int( s.group(1), 16 ) ]
            if sym is not None and ( sym.name != cur_func.name or sym.module != cur_func.module ):
               cur_func = cur_thread.funcs.new( s.group(1), sym.name, sym.module )
         cur_instr = cur_thread.instrs.new( s.group(1), line )
         for addr, perf in fsn_list: # macro fusion: adjust for jump
            if perf.first is not None: perf.first += 1
            if perf.last is not None: perf.last += 1
            cur_instr.execs += perf.execs
            cur_instr.revise( None, perf )
            cur_block.nexts[addr] += perf
         del fsn_list[:]
         del prv_list[:]
         del nxt_list[:]
         if nxt_block is not None: nxt_block.append( cur_instr )
         else: nxt_block = cur_thread.blocks.new( cur_instr, cur_func )
         cur_block = nxt_block
         cur_func.instrs.append( cur_instr ) #!!! targeted for delete

      s = re_case.match( line )
      if s is not None:
         cur_perf = perf_info( line )
         cur_instr.revise( None, cur_perf )
         for c in re_evnt.finditer( s.group(1) ):
            if c.group(0).lower().startswith( "events" ):
               for e in re_wrds.split( c.group(1) ): cur_instr[ e.lower() ].add( cur_perf )
            elif c.group(0).lower().startswith( "prev_ip" ):
               if cur_perf.execs == 1 and cur_perf.first is not None and cur_perf.first < 150 and int( c.group(3), 16 ) < 0x300 and int( cur_instr.start, 16 ) >= 0x300: continue # start-up iret
               if cur_block[0] is not cur_instr:
                  if cur_block[-2].start.lower() == c.group(3).lower(): prv_list.append( ( c.group(3), cur_perf ) ); continue
                  cur_block.remove( cur_instr )
                  cur_block = cur_thread.blocks.new( cur_instr, cur_func )
                  for addr, perf in prv_list: cur_block.prevs[addr] += perf
                  del prv_list[:]
                  if nxt_block is not None: nxt_block = cur_block
               cur_block.prevs[ c.group(3) ] += cur_perf
            elif c.group(0).lower().startswith( "next_ip" ):
               if nxt_block is not None and int( cur_instr.start, 16 ) + cur_instr.size == int( c.group(3), 16 ):
                  nxt_list.append( ( c.group(3), cur_perf ) ); continue
               if any( cur_instr.cmd.lower().startswith( i ) for i in ( "cmp", "and", "test", "or" ) ): fsn_list.append( ( c.group(3), cur_perf ) ); continue # macro fusion: keep for jump's adjust
               if cur_perf.execs == 1 and cur_perf.first is not None and cur_perf.first < 150 and int( c.group(3), 16 ) >= 0x300 and int( cur_instr.start, 16 ) < 0x300: continue # start-up iret
               cur_block.nexts[ c.group(3) ] += cur_perf
               nxt_block = None
               for addr, perf in nxt_list: cur_block.nexts[addr] += perf
               del nxt_list[:]
      s = re_summ.match( line )
      if s is not None:
         assert s.group(1) == "5" or cur_group.start == s.group(1)
         cur_group.perf = perf_info( line )
         cur_instr      = None
      s = re_clee.match( line )
      if s is not None: cur_group.callee[ s.group(1) ] = int( s.group(2) )
      if cur_instr is not None: cur_instr.text += line
except:
   print( line )
   raise

if dtr_name is not None:
   cur_thread = threads[0]
   for line in fileinput.input( dtr_name ):
      line.strip()
      if line.startswith( '#' ):
         if ':' in line:
            pass # module:base -- later
         else:
            rec = line[1:].split()
            if rec[-1].startswith( 'tid' ):
               idx = int(rec[-1][3:])
               cur_thread = threads[idx] if idx < len(threads) else None
      elif cur_thread is not None:
         rec = line.split()
         if len(rec) >= 3:
            if cur_thread.blocks.distrib is None: cur_thread.blocks.distrib = []
            cur_thread.blocks.distrib.append( ( int(rec[0],16), int(rec[1]), int(rec[2]) ) )
   for t in threads:
      if t.blocks.distrib is not None: t.blocks.distrib.sort()

threads.revise( func_name, fcs_loops )
thread = threads[threads.focus]

print( ';*** Functions profile: (addr module [share%] name #instrs weight)' )
for f in sorted( ( f for f in thread.funcs.values() if f.weight is not None ), reverse = True ): print( f.start, split( f.module )[1] if f.module is not None else None, "[{:.1%}]".format( f.weight / thread.funcs.weight ), f.name, f.instn, f.weight )

if fout_rqrd and thread.instrs:
   print( '' )
   max_clks = max( thread.instrs, key=lambda i: i.clks if i.clks is not None else 0 )
   if max_clks.clks is not None:
      print( "\n;*** Highest #clks:" )
      print( max_clks.text, end='' )

   # suspicious instructions are ones which have close to max_clks #instrs and high events rate
   
   for e in ( 'dcu_miss', 'l2_miss', 'dtlb_miss', 'l1d_stl_known', 'l1d_stl_unkwn', 'itlb_miss', 'jeclear' ):
      slw_inst = max( thread.instrs, key=lambda i: i[e].clks if e in i else 0 )
      if e in slw_inst:
         print( "\n;*** Highest #clks due to {}:".format( e ) )
         print( slw_inst.text, end='' )
      if e in { 'itlb_miss', 'jeclear' }: continue
      max_inst = max( thread.instrs, key=lambda i: i[e].execs if e in i else 0 )
      if e in max_inst:
         if max_inst is not slw_inst:
            print( "\n;*** Highest #{}:".format( e ) )
            print( max_inst.text, end='' )
         rti_inst = max( filter( lambda i: i.execs / max_inst.execs > 0.5, thread.instrs ), key=lambda i: i[e].execs / i.execs if e in i else 0 )
         if rti_inst is not slw_inst and rti_inst is not max_inst:
            print( "\n;*** Highest {} rate ( filtered by #instrs ):".format( e ) )
            print( rti_inst.text, end='' )
      if e in slw_inst:
         rtc_inst = max( filter( lambda i: i.clks / slw_inst.clks > 0.5, thread.instrs ), key=lambda i: i[e].execs / i.execs if e in i else 0 )
         if rtc_inst is not slw_inst and rtc_inst is not max_inst and rtc_inst is not rti_inst:
            print( "\n;*** Highest {} rate ( filtered by #clks ):".format( e ) )
            print( rtc_inst.text, end='' )

if fout_rqrd and thread.blocks.triads:
   print( "\n\n;*** Triad with highest weight  ", end='' )
   thread.blocks.triads[-1].print( thread.blocks )
   
for i, l in enumerate( thread.blocks.loops, 1 ):
   print( "\n\n;*** Loop with highest weight   " if i == 1 else "\n\n;*** Loop {:2}                    ".format( i ), end='' )
   l.print( thread.blocks )
 
if fout_rqrd:
   print( "\n\n;*** Empty prevs:" )
   for b in filter( lambda b: len( b.prevs ) == 0, thread.blocks ): b.print( thread.blocks )
   print( "\n;*** Empty nexts:" )
   for b in filter( lambda b: len( b.nexts ) == 0, thread.blocks ): b.print( thread.blocks )

if thread.blocks.loops: print( "\n\n;*** Pipetrace generation advice:" )
for l in thread.blocks.loops:
   cnt = round( l.tc * l.ipi )
   if cnt * 0.7 >= 5000:
      print( pow( 2, ceil( log( l.tc / ( 5000 / 0.7 / l.ipi ), 2 ) ) ) )
      cnt = round( l.tc * l.ipi / pow( 2, ceil( log( l.tc / ( 5000 / 0.7 / l.ipi ), 2 ) ) ) )
   if not l.prevs:
      first = min( i.first - l[0].index( i ) for i in l[0] if i.first is not None )
   else:
      prlg  = min( l.prevs, key=lambda b: b.first )
      first = max( i.first - prlg.index( i ) + len(prlg) for i in prlg if i.first is not None )
   skip_cnt  = round( cnt * 0.1 )                  # 10% as warmup of caches and TLBs
   exec_cnt  = cnt - skip_cnt - round( cnt * 0.1 ) # 10% at the end isn't included
   pipe_cnt  = round( cnt * 0.7 )                  # 70% as pipe trace ( 10% to warmup LSD, DSB, etc. )
   skip_cnt += first
   clrs_cnt  = min( skip_cnt, 100000 )             # ~100K instrs to warmup core itself
   skip_cnt -= clrs_cnt                            # we use for that part of warmup
   exec_cnt += clrs_cnt
   if len(threads) == 1: print( "-s {} -e {} -clear_stats {} -pipe_start_from_end {}".format( skip_cnt, exec_cnt, clrs_cnt, pipe_cnt ) )
   else: print( "-e {} -clear_stats {} -pipe_start_from_end {}".format( skip_cnt + exec_cnt, skip_cnt + clrs_cnt, pipe_cnt ) )

if need_warmup:
   first =      min( min( min( b.first for b in l.prevs ), l[ 0].first - 1 ) if l.prevs else l.first - 1 for l in thread.blocks.loops )
   last  = min( max( max( max( b.last  for b in l.nexts ), l[-1].last  + 1 ) if l.nexts else l.last  + 1 for l in thread.blocks.loops ), thread.blocks.last )

   if last > 300e6:
      first = round( last / 361 * 300 )
   warmup_share = work_share / 2 if work_share <= 0.1 else 0.1 # 10% as warmup of caches and TLBs or half of work share if <= 10%
   skip_cnt  = first + round( ( last - first ) * warmup_share )
   exec_cnt  = round( ( last - skip_cnt ) * work_share )
   clrs_cnt  = min( skip_cnt, 100000 ) # ~100K instrs to warmup core itself
   skip_cnt -= clrs_cnt                # we use for that part of warmup
   exec_cnt += clrs_cnt
   print( "\n\n;*** Warmup & Simulation advice:" )
   if len(threads) == 1: print( "-s {} -e {} -clear_stats {}".format( skip_cnt, exec_cnt, clrs_cnt ) )
   else: print( "-e {} -clear_stats {}".format( skip_cnt + exec_cnt, skip_cnt + clrs_cnt ) )
else:
   exec_cnt = thread.blocks.last
   clrs_cnt = exec_cnt // 2 if exec_cnt < 200000 else 100000
   print( "\n\n;*** Warmup & Simulation advice:\n-e {} -clear_stats {}".format( exec_cnt, clrs_cnt ) )

if dtr_rqrd:
   if dtr_cpi is None or threads.clks is None: cpi_scale = 1.0
   else:
      cpi_bscl  = 1.0
      cpi_merr  = abs( float(threads.clks)/threads.execs - dtr_cpi )
      cpi_scale = dtr_cpi*threads.execs/threads.clks
      for k in range(3):
         adj_clks = sum( int(0.5+cpi_scale*i.clks) for t in threads for b in t.blocks for i in b if i.clks is not None )
         adj_err  = abs( float(adj_clks)/threads.execs - dtr_cpi )
         if adj_err < cpi_merr: cpi_bscl,cpi_merr = cpi_scale,adj_err
         cpi_scale = cpi_scale*dtr_cpi*threads.execs/adj_clks
      cpi_scale = cpi_bscl
   print( "\n\n;*** Distribution:" )
   for i,t in enumerate(threads):
      print( '#{}'.format(1), end='' )
      if t.blocks.clks is not None: print( ' {}'.format(1), end='' )
      print( ' tid{}'.format(i) )
      print( '#:0' )
      for b in t.blocks: b.export( cpi_scale )
if need_export:
   print( "\n\n;*** Export:" )
   for l in thread.blocks.loops: l.export( thread.blocks )
 
if need_split:
   print( "\n\n;*** Split:" )
   for l in thread.blocks.loops: print( '-s {} -e {}'.format( l.enter, l.exit-l.enter-1 ) )

if regions.focus is not None:
   print( "\n\n;*** Regions:" )
   for a in regions.focus:
      b = thread.blocks.rngs[rng(a.vrt)]
      if not int(b.start,16) <= int(a.vrt,16) <= int(b.end,16):
         print( a.vrt )
      #else:
      #   b.call.print( thread.blocks )
   if all( int(b.start,16) <= int(a.vrt,16) <= int(b.end,16) for a in regions.focus for b in ( thread.blocks.rngs[rng(a.vrt)], ) ):
      #for s in ( b.start for r in regions for a in ( r[1], r[3] ) for b in ( thread.blocks.rngs[rng(a.vrt)], ) if int(b.start,16) <= int(a.vrt,16) <= int(b.end,16) ): print(s)
   
      mpi_calls = frozenset( e.target for r in regions for a in ( r[0], r[2] ) for b in ( thread.blocks.rngs[rng(a.vrt)], ) if int(b.start,16) <= int(a.vrt,16) <= int(b.end,16) for e in b.call.nexts.values() )

      #for b in mpi_calls: b.print( thread.blocks )
   
      m1 = thread.blocks.rngs[rng(regions.focus[0].vrt)]
      u1 = thread.blocks.rngs[rng(regions.focus[1].vrt)]
      m2 = thread.blocks.rngs[rng(regions.focus[2].vrt)]
      u2 = thread.blocks.rngs[rng(regions.focus[3].vrt)]
      
      p     = path( prt_alwd=True, stops=mpi_calls, reset=True )
      p1    = p.find_path_run( ( u1, m1 ) )
      p2,p3 = None,None
      #print( 'p1:', p1 )
      if p1 is not None:
         p2 = p.find_path_run( ( u2.call, u1 ), pcl_alwd=True, reset=True )
         #print( 'p2:', p2 )
         if p2 is not None: p3 = p.find_path_run( ( m2.call, u2.call ), reset=True )
         else:              p3 = None
         if p3 is None:
            p2 = p.find_path_run( ( m2.call, u1 ), reset=True )
            if p2 is None: p1 = None
      if p1 is None:
         p1 = p.find_path_run( ( u2.call, m1 ), pcl_alwd=True, reset=True )
         #print( 'p1:', p1 )
         if p1 is not None: p2 = p.find_path_run( ( m2.call, u2.call ), reset=True )
         else:              p2 = None
         #print( 'p2:', p2 )
         if p2 is None: p1 = p.find_path_run( ( m2.call, m1 ), reset=True )
      if p1 is not None:
         #t = p.find_path_run( ( m1.call, u1.call ), pcl_alwd=True, prt_alwd=True, stops=frozenset(), reset=True )
         #if t is not None:
         #   p[:] = t[1:]
         #   p.append( t[0] )
         #   mine1 = p.revise( thread.blocks )
         #   p.print( thread.blocks, True )
         #else: mine1 = None
         #t = p.find_path_run( ( m2.call, u2.call ), pcl_alwd=True, prt_alwd=True, stops=frozenset(), reset=True )
         #if t is not None:
         #   p[:] = t[1:]
         #   p.append( t[0] )
         #   mine2 = p.revise( thread.blocks )
         #   p.print( thread.blocks, True )
         #else: mine2 = None
         #mine = min_or_none( mine1, mine2 )
         #mine = p1[0].execs
         #print( mine )
         del p[:]
         for t in ( p1, p2, p3 ):
            if t is None: break
            p.extend( t[1:] )
         p.append( m2.call )
         p.revise_single( thread.blocks )
         #p.revise( thread.blocks )
         p.print( thread.blocks, True )
         print( "\n\n;*** Export:" )
         print( "{:.1f}:{}:{}:{}:{}:{:.3f}:{:.1f}:{:.1f}".format( p.instn, regions.focus[0], regions.focus[1], regions.focus[2], regions.focus[3], float(p.clks)/p.execs, float(p.clks)/p.instn, float(p.execs)/p.instn ) )
 

if list_req:
   print( "\n\n;*** List:" )
   lsd = dict()
   for t in threads:
      for l in t.blocks.loops:
         fd = None
         bs = []
         for b in l.body():
            if fd is None:
               for f in t.funcs.values():
                  if b in f: fd = f; break
            bs.append( ( int( b[0].start, 16 ), int( b[-1].start, 16) - int( b[0].start, 16 ) + b[-1].size ) )
         if fd is not None:
            ld = lsd.setdefault( (bs[0][0],bs[-1][0],bs[0][1],bs[-1][1]), loop_desc( fd.module, fd.name, l.weight ) )
            if ld:
               ld[:] = [ (a,s) for (a,s) in ld if (a,s) in bs ]
               ld.weight += l.weight
            else:
               ld[:] = bs
   for ld in sorted(lsd.values(),key=lambda v:v.weight,reverse=True):
      print( ld.weight, ld.module, ld.func, end='' )
      for bd in ld:
         print( ' {}:{}'.format( bd[0], bd[1] ), end='' )
      print( '' )


if stats_rqrd:
   print( "\n\n;*** Stats:" )
   print( "instrs_retired\t{}".format( thread.blocks.execs ) )
   if thread.blocks.clks is not None:
      print( "cycles\t{}\ncpi\t{:.5f}".format( thread.blocks.clks, float(thread.blocks.clks)/thread.blocks.execs ) )


sys.exit( 0 )
